This demo will run you through a complete dataset integration of T cell datasets using STACAS. We will be using the four following datasets:

To run the code on your machine, clone the STACAS.demo GitLab repository

First, check dependencies and install STACAS

if (!requireNamespace("remotes")) install.packages("remotes")
library(remotes)

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

if (!requireNamespace("TILPRED", quietly = TRUE))
  remotes::install_github("carmonalab/TILPRED")

if (!requireNamespace("STACAS", quietly = TRUE))
  remotes::install_github("carmonalab/STACAS")

Load required packages

library(Seurat)
library(STACAS)
library(TILPRED)
library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra)
library(patchwork)

options(future.globals.maxSize= 8000*1024^2, future.seed=TRUE)

Download data from GEO, or load pre-computed dataset list

do_download = T
if (do_download) {
  source("./download.sets.STACAS.demo.R")
}  
raw.list <- readRDS("raw.list.demo.rds")
raw.list

Run TILPRED to get an idea of what kinds of cells are in each dataset

for (i in 1:length(raw.list)) {
   
   print(paste0("Cell state predictions for ", names(raw.list)[i], ":"))
   sce <- as.SingleCellExperiment(raw.list[[i]])
   sce.pred <- predictTilState(sce)

   raw.list[[i]] <- AddMetaData(raw.list[[i]], metadata=sce.pred$predictedState, col.name = "state.pred")
   raw.list[[i]] <- AddMetaData(raw.list[[i]], metadata=sce.pred$cycling, col.name = "cycling")
   raw.list[[i]]$Study <- names(raw.list)[[i]]
}
stateColorsPred2 <- c("#FF0000","#0000FF","#F8766D","#A58AFF","#00B6EB","#53B400",
                      "#E5ED14","#33333355")
names(stateColorsPred2) <- c("Treg","CD4T","CD8T_Naive","CD8T_EffectorMemory","CD8T_MemoryLike","CD8T_Exhausted",
                      "CD8T_unknown","Tcell_unknown") 

Remove non-Tcells and cycling cells

ref.list <- raw.list
for (i in 1:length(raw.list)) {
   ref.list[[i]] <- subset(ref.list[[i]], subset = cycling==FALSE)
   ref.list[[i]] <- subset(ref.list[[i]], subset = state.pred %in% c("Non-Tcell","unknown"), invert=T)
}

Control 1 - Combine datasets without alignment

Observe the large batch effects separating each dataset - there is no overlap between samples. Also note the heterogeneity of individual sets: the Cd8+ contains a relatively large fraction of naive-like and effector-memory cells; the Cd4+ sets are mainly composed (unsurpisingly) of Cd4 T cells and Tregs; the Cd4+/Cd8+ set contains the largest diversity of cells types.

ref.merged <- Reduce(merge,ref.list)

ndim=10
set.seed(1234)
ref.merged <- NormalizeData(ref.merged, verbose = FALSE)
ref.merged <- FindVariableFeatures(ref.merged, selection.method = "vst", nfeatures = 800, verbose = FALSE)

mito.genes <- grep(pattern = "^mt-", rownames(ref.merged), value = TRUE)
ribo.genes <- grep(pattern = "^Rp[ls]", rownames(ref.merged), value = TRUE)
    
ref.merged <- ScaleData(ref.merged)
ref.merged <- RunPCA(ref.merged, features = ref.merged@assays$RNA@var.features, ndims.print = 1:5, nfeatures.print = 5)

ref.merged <- RunUMAP(ref.merged, reduction = "pca", dims = 1:ndim, seed.use=123)

DimPlot(ref.merged, reduction = "umap", group.by = "Study") + ggtitle("UMAP by study")

DimPlot(ref.merged, reduction="umap", label=F, group.by = "state.pred", repel=T, cols=stateColorsPred2) + 
     ggtitle ("UMAP of predicted states")

Control 2 - Seurat alignment with default method

Here we apply Seurat 3 to find variable genes, calculate integration anchors, and finally integrate the data.

ref.list.default <- ref.list

all.genes <- row.names(ref.list.default[[1]])
for (i in 2:length(ref.list.default)) {
   all.genes <- intersect(all.genes, row.names(ref.list.default[[i]]))
}

var.genes.n <- 800
var.genes.integrated.n <- 500
ndim <- 10

mito.genes <- grep(pattern = "^mt-", rownames(ref.list[[i]]), value = TRUE)
ribo.genes <- grep(pattern = "^Rp[ls]", rownames(ref.list[[i]]), value = TRUE)
my.blocklist <- c(mito.genes, ribo.genes)

for (i in 1:length(ref.list.default)) {
    ref.list.default[[i]] <- NormalizeData(ref.list.default[[i]], verbose = FALSE)
    
    ref.list.default[[i]] <- FindVariableFeatures(ref.list.default[[i]],
                                                  nfeatures = var.genes.n, verbose = FALSE)
    
    VariableFeatures(ref.list.default[[i]]) <- setdiff(VariableFeatures(ref.list.default[[i]]), my.blocklist)
}

#Find integration anchors using CCA
ref.anchors.default <- FindIntegrationAnchors(ref.list.default, dims = 1:ndim, anchor.features=var.genes.integrated.n)

#And finally integrate
ref.integrated.default <- IntegrateData(anchorset = ref.anchors.default, dims = 1:ndim, features.to.integrate = all.genes)

ref.integrated.default@tools$Integration@sample.tree

After integration, we can examine how the transformation has affected the expression value of some important genes. Note how the distribution for Cd4 and Cd8a have been rescaled around a similar mean and variance - some important biological signal that distiguishes the different samples has been lost in the process.

Idents(ref.integrated.default) = "Study"
DefaultAssay(ref.integrated.default) <- "RNA"
VlnPlot(ref.integrated.default, features=c("Cd2","Cd3g","Cd8a","Cd4","Pdcd1","Tcf7"), pt.size = 0, ncol=6)

DefaultAssay(ref.integrated.default) <- "integrated"
VlnPlot(ref.integrated.default, features=c("Cd2","Cd3g","Cd8a","Cd4","Pdcd1","Tcf7"), pt.size = 0, ncol=6)

We can plot the integrated samples in a low-dimensional space (UMAP) and highlight some key markers genes. The UMAP seems to indicate the CCA over-corrected the gene expression values, removing biological signal together with batch effects.

set.seed(1234)

ndim=15

ref.integrated.default <- ScaleData(ref.integrated.default, verbose = TRUE)
ref.integrated.default <- RunPCA(ref.integrated.default,
                                 features = ref.integrated.default@assays$integrated@var.features)

ref.integrated.default <- RunUMAP(ref.integrated.default, reduction = "pca", dims = 1:ndim,
                                  seed.use=123)

DimPlot(ref.integrated.default, reduction = "umap", group.by = "Study") + ggtitle("UMAP by study")

#By predicted state
DimPlot(ref.integrated.default, reduction="umap", label=F, group.by = "state.pred",
        repel=T, cols=stateColorsPred2) + 
     ggtitle ("UMAP of predicted states")

DefaultAssay(ref.integrated.default) <- "RNA"
FeaturePlot(ref.integrated.default, reduction = "umap", features=c("Cd4","Cd8b1","Pdcd1","Tcf7"),
            pt.size = 0.01, slot = "data", ncol=2)

FeaturePlot(ref.integrated.default, reduction = "umap", features=c("Foxp3","Gzmb","Havcr2","Tox"),
            pt.size = 0.01, slot = "data", ncol=2)

3 - Integration with STACAS

The normalization and identification of variable features is the same as in standad Seurat pipelines.

var.genes.n <- 800
var.genes.integrated.n <- 500

mito.genes <- grep(pattern = "^mt-", rownames(ref.list[[i]]), value = TRUE)
ribo.genes <- grep(pattern = "^Rp[ls]", rownames(ref.list[[i]]), value = TRUE)
my.blocklist <- c(mito.genes, ribo.genes)

for (i in 1:length(ref.list)) {
    ref.list[[i]] <- NormalizeData(ref.list[[i]], verbose = FALSE)
    
    ref.list[[i]] <- FindVariableFeatures.STACAS(ref.list[[i]], nfeat = var.genes.n, genesBlockList = my.blocklist)
}

anchor.features <- SelectIntegrationFeatures(ref.list, nfeatures = var.genes.integrated.n)

Find integration anchors with pair-wise dataset distances. This function is based on FindAnchors in Seurat, but does not center or scale the data (i.e. does not impose zero mean and unit variance for variable genes) prior to running reciprocal PCA for anchor finding. It also returns pairwise distances betwen anchors, which can be used in the next step for anchor filtering.

ndim=10
ref.anchors <- FindAnchors.STACAS(ref.list, dims=1:ndim, anchor.features=anchor.features)

Plot the distance distribution between the anchors calculated in the previous step.

names <- names(ref.list)

plots <- PlotAnchors.STACAS(ref.anchors, obj.names=names)

wrap_plots(plots)

Now we have calculated and weighted a set of anchors for integration. Based on these anchors, STACAS can also determine an optimal integration tree (that is, the order in which dataset should be integrated), and we can then pass this tree together with the anchor set to the IntegrateData function in Seurat.

st1 <- SampleTree.STACAS(
  anchorset = ref.anchors,
  obj.names = names(ref.list)
  )

ref.integrated <- IntegrateData.STACAS(ref.anchors, dims=1:ndim, sample.tree=st1,
                                 features.to.integrate=all.genes)

We can check that integration did not remove all biological differences between data sets in term of gene expression.

Idents(ref.integrated) = "Study"
DefaultAssay(ref.integrated) <- "RNA"
VlnPlot(ref.integrated, features=c("Cd2","Cd3g","Cd8a","Cd4","Pdcd1","Tcf7"), pt.size = 0, ncol=6)

DefaultAssay(ref.integrated) <- "integrated"
VlnPlot(ref.integrated, features=c("Cd2","Cd3g","Cd8a","Cd4","Pdcd1","Tcf7"), pt.size = 0, ncol=6)

Finally, we can examine the integrated map made from the four samples. Cells from the same cell types (as predicted by TILPRED) cluster together, and important gene markers for T cells delineate clear sub-type areas and meaningful gradients.

set.seed(1234)

ndim=15

ref.integrated <- ScaleData(ref.integrated, verbose = TRUE)
ref.integrated <- RunPCA(ref.integrated, features = ref.integrated@assays$integrated@var.features,
                         ndims.print = 1:5, nfeatures.print = 5)

ndim=10  #how many PCA components to retain

ref.integrated <- RunUMAP(ref.integrated, reduction = "pca", dims = 1:ndim, seed.use=123, n.neighbors = 30, min.dist=0.3)

DimPlot(ref.integrated, reduction = "umap", group.by = "Study") + ggtitle("UMAP by study")

#By predicted state
DimPlot(ref.integrated, reduction="umap", label=F, group.by = "state.pred", repel=T, cols=stateColorsPred2) +
  ggtitle ("UMAP of predicted states")

DefaultAssay(ref.integrated) <- "RNA"
FeaturePlot(ref.integrated, reduction = "umap", features=c("Cd4","Cd8b1","Pdcd1","Tcf7"),
            pt.size = 0.01, slot = "data", ncol=2)

FeaturePlot(ref.integrated, reduction = "umap", features=c("Foxp3","Gzmb","Havcr2","Tox"),
            pt.size = 0.01, slot = "data", ncol=2)

While Seurat integration tends to overcorrect batch effects, STACAS limits the overlap between CD4+ and CD8+ T cell populations. Therefore, STACAS integration is useful to mitigate batch effects between datasets, while preserving biological variability.

Further reading

The STACAS package and installation instructions are available at: STACAS package

The code for this demo can be found on GitHub

See a further example of STACAS integration, including its application as a semi-supervised methods, in this demo on PBMC data.